################################################################################
# 05_secondary_regressions.R
# Secondary Event Study Regressions for JHR Paper
# "Teacher Testing Standards and the New Teacher Pipeline"
# Law, Marks, and Stern
#
# Implements event studies that use datasets beyond the main enrollment/graduation
# panels. These produce the data underlying Figures 5A-C, 6, and 7.
#
# Regressions:
#   - Placebo: Non-Education Enrollments (Figure 5A)
#   - Placebo: Non-Education Completions (Figure 5B)
#   - Placebo: Other Education Completions (Figure 5C)
#   - State Licenses Event Study (Figure 6)
#   - Teacher Shortages Event Study (Figure 7)
#
# Note: Converts Stata reghdfe + pairs cluster bootstrap to
#       R fixest::feols + manual pairs cluster bootstrap
################################################################################

library(tidyverse)
library(readxl)
library(haven)
library(fixest)
library(broom)

# ── Configuration ─────────────────────────────────────────────────────────────

USE_BOOTSTRAP <- FALSE  # Set to TRUE for publication-quality bootstrap SEs
BOOT_REPS <- 1000
BOOT_SEED <- 12345

out_dir <- "output/tables"
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)

# ── Constants ─────────────────────────────────────────────────────────────────

EXCLUDE_STATES <- c("ND", "TN")
SC_POST2017_VALUE <- -0.6576

# Control variable definitions (same as 06_main_regressions.R)
state_demo <- c("real_income", "unemployment_rate")
state_kraft <- c("passevals", "implementevals", "eliminate_tenure",
                 "increase_probationary_period", "weaken_bargaining",
                 "eliminate_union_dues", "won_race_top", "common_core", "edtpa")
uni_controls <- c("test_optional", "satpct2", "actpct2",
                  "satvr25_2", "satvr75_2", "satmt25_2", "satmt75_2",
                  "actcm25_2", "actcm75_2",
                  "pell_percent2", "pell_amount2",
                  "loan_percent2", "loan_average2", "enrollment_total2")
all_controls <- c(state_demo, state_kraft, uni_controls)

# ── Event Study Helper ────────────────────────────────────────────────────────
# Reuses the same pairs cluster bootstrap pattern from 06_main_regressions.R

run_event_study <- function(df, outcome, year_vars, controls = NULL,
                            fe = "unitid_f + year_f", cluster_var = "State",
                            ref_year = 2012,
                            bootstrap = USE_BOOTSTRAP, B = BOOT_REPS,
                            seed = BOOT_SEED) {

  year_vars <- year_vars[year_vars %in% names(df)]
  valid_controls <- if (!is.null(controls)) controls[controls %in% names(df)] else NULL

  if (is.null(valid_controls) || length(valid_controls) == 0) {
    fml <- as.formula(paste0(outcome, " ~ ", paste(year_vars, collapse = " + "),
                              " | ", fe))
  } else {
    control_str <- paste(valid_controls, collapse = " + ")
    fml <- as.formula(paste0(outcome, " ~ ", paste(year_vars, collapse = " + "),
                              " + ", control_str, " | ", fe))
  }

  cluster_fml <- as.formula(paste0("~", cluster_var))
  mod <- feols(fml, data = df, cluster = cluster_fml)

  results <- tidy(mod, conf.int = TRUE) %>%
    filter(grepl("^year_", term)) %>%
    mutate(year = as.numeric(gsub("year_", "", term))) %>%
    select(year, estimate, std.error, conf.low, conf.high)

  if (bootstrap) {
    cat("    Running pairs cluster bootstrap (B=", B, ")...\n")
    set.seed(seed)
    clusters <- unique(df[[cluster_var]])
    G <- length(clusters)
    cluster_list <- split(df, df[[cluster_var]])

    boot_matrix <- matrix(NA, nrow = B, ncol = length(year_vars))
    colnames(boot_matrix) <- year_vars
    n_failures <- 0

    for (b in 1:B) {
      sampled <- sample(clusters, G, replace = TRUE)
      boot_data <- do.call(rbind, cluster_list[as.character(sampled)])
      rownames(boot_data) <- NULL

      tryCatch({
        boot_mod <- feols(fml, data = boot_data)
        bc <- coef(boot_mod)
        for (yv in year_vars) {
          if (yv %in% names(bc)) boot_matrix[b, yv] <- bc[yv]
        }
      }, error = function(e) {
        n_failures <<- n_failures + 1
      })
    }

    if (n_failures > 0) {
      cat(sprintf("    Note: %d/%d bootstrap reps failed\n", n_failures, B))
    }

    for (i in seq_len(nrow(results))) {
      yv <- paste0("year_", results$year[i])
      valid_coefs <- boot_matrix[, yv][!is.na(boot_matrix[, yv])]
      if (length(valid_coefs) > B * 0.5) {
        boot_se <- sd(valid_coefs)
        results$std.error[i] <- boot_se
        z_crit <- qnorm(0.975)
        results$conf.low[i] <- results$estimate[i] - z_crit * boot_se
        results$conf.high[i] <- results$estimate[i] + z_crit * boot_se
      }
    }
  }

  # Add reference year row
  results <- results %>%
    add_row(year = ref_year, estimate = 0, std.error = 0,
            conf.low = 0, conf.high = 0) %>%
    arrange(year)

  list(model = mod, results = results, n = mod$nobs,
       n_clusters = length(unique(df[[cluster_var]])))
}

# ══════════════════════════════════════════════════════════════════════════════
# PLACEBO EVENT STUDIES (Figures 5A-C)
# ══════════════════════════════════════════════════════════════════════════════

cat("\n", strrep("=", 60), "\n")
cat("PLACEBO EVENT STUDIES (Figures 5A-C)\n")
cat(strrep("=", 60), "\n")

# ── Load and Merge Graduation Placebo Data ───────────────────────────────────
# Stata: loads graduation_event_data.xlsx, merges 1:1 with placebo_data.dta

cat("Loading graduation data for placebo...\n")
grad_raw <- read_excel("data/raw/composite_treatment/graduation_event_data.xlsx")
placebo_grad <- read_stata("data/raw/placebo/placebo_data.dta")

# Merge graduation event data with placebo completions
grad_placebo <- grad_raw %>%
  inner_join(placebo_grad, by = c("unitid", "year")) %>%
  filter(!State %in% EXCLUDE_STATES) %>%
  # Fill forward test_index
  arrange(unitid, year) %>%
  group_by(unitid) %>%
  fill(test_index, .direction = "down") %>%
  ungroup() %>%
  # CT and SC adjustments
  mutate(test_index = if_else(State == "CT" & year > 2017, NA_real_, test_index)) %>%
  mutate(test_index = if_else(State == "SC" & year > 2017, SC_POST2017_VALUE, test_index)) %>%
  mutate(
    unitid_f = factor(unitid),
    year_f = factor(year),
    state_f = factor(State)
  )

cat("  Graduation placebo data:", nrow(grad_placebo), "obs,",
    n_distinct(grad_placebo$State), "states,",
    n_distinct(grad_placebo$unitid), "universities\n")

# ── Figure 5B: Non-Education Completions (l_all_other) ────────────────────────
cat("\n  Figure 5B: Non-Education Completions...\n")

grad_years <- paste0("year_", c(2009:2011, 2013:2020))

fig5b_es <- run_event_study(
  grad_placebo, "l_all_other", grad_years,
  controls = all_controls, fe = "unitid_f + year_f"
)

cat("    N =", fig5b_es$n, ", clusters =", fig5b_es$n_clusters, "\n")

# ── Figure 5C: Other Education Completions (l_nonkraft) ──────────────────────
cat("\n  Figure 5C: Other Education Completions...\n")

fig5c_es <- run_event_study(
  grad_placebo, "l_nonkraft", grad_years,
  controls = all_controls, fe = "unitid_f + year_f"
)

cat("    N =", fig5c_es$n, ", clusters =", fig5c_es$n_clusters, "\n")

# Save graduation placebo results (Figures 5B & 5C) in one CSV
# Match Stata output format: column 2 = Other Education, column 3 = Non-Education
grad_placebo_out <- fig5c_es$results %>%
  rename(other_ed_estimate = estimate, other_ed_se = std.error,
         other_ed_conf.low = conf.low, other_ed_conf.high = conf.high) %>%
  left_join(
    fig5b_es$results %>%
      rename(non_ed_estimate = estimate, non_ed_se = std.error,
             non_ed_conf.low = conf.low, non_ed_conf.high = conf.high),
    by = "year"
  )

write_csv(grad_placebo_out, file.path(out_dir, "placebo_event_study_total.csv"))
cat("  Saved: placebo_event_study_total.csv\n")

# Also save individual CSVs for direct use by figures script
write_csv(fig5b_es$results, file.path(out_dir, "placebo_non_ed_completions.csv"))
write_csv(fig5c_es$results, file.path(out_dir, "placebo_other_ed_completions.csv"))

# ── Load and Merge Enrollment Placebo Data ───────────────────────────────────
cat("\nLoading enrollment data for placebo...\n")
enroll_raw <- read_excel("data/raw/composite_treatment/enrollments_event_data.xlsx")
placebo_enroll <- read_stata("data/raw/placebo/placebo_enrollment_data.dta")

# Merge enrollment event data with placebo enrollments
enroll_placebo <- enroll_raw %>%
  inner_join(placebo_enroll %>% select(unitid, year, non_ed_enrollment, l_non_ed_enrollment),
             by = c("unitid", "year")) %>%
  filter(!State %in% EXCLUDE_STATES) %>%
  arrange(unitid, year) %>%
  group_by(unitid) %>%
  fill(test_index, .direction = "down") %>%
  ungroup() %>%
  mutate(test_index = if_else(State == "CT" & year > 2017, NA_real_, test_index)) %>%
  mutate(test_index = if_else(State == "SC" & year > 2017, SC_POST2017_VALUE, test_index)) %>%
  mutate(
    unitid_f = factor(unitid),
    year_f = factor(year),
    state_f = factor(State)
  )

cat("  Enrollment placebo data:", nrow(enroll_placebo), "obs,",
    n_distinct(enroll_placebo$State), "states,",
    n_distinct(enroll_placebo$unitid), "universities\n")

# ── Figure 5A: Non-Education Enrollments (biennial) ──────────────────────────
cat("\n  Figure 5A: Non-Education Enrollments...\n")

enroll_years <- paste0("year_", c(2008, 2010, 2014, 2016, 2018))

fig5a_es <- run_event_study(
  enroll_placebo, "l_non_ed_enrollment", enroll_years,
  controls = all_controls, fe = "unitid_f + year_f"
)

cat("    N =", fig5a_es$n, ", clusters =", fig5a_es$n_clusters, "\n")

write_csv(fig5a_es$results, file.path(out_dir, "enrollments_placebo_event_study_total.csv"))
cat("  Saved: enrollments_placebo_event_study_total.csv\n")

# Print placebo summary
cat("\n  Placebo Event Study Summary:\n")
cat("  Figure 5A (Non-Ed Enrollment, biennial):\n")
for (i in 1:nrow(fig5a_es$results)) {
  r <- fig5a_es$results[i, ]
  cat(sprintf("    %4d:  %7.3f (%5.3f)\n", r$year, r$estimate, r$std.error))
}
cat("  Figure 5B (Non-Ed Completions, annual):\n")
for (i in 1:nrow(fig5b_es$results)) {
  r <- fig5b_es$results[i, ]
  cat(sprintf("    %4d:  %7.3f (%5.3f)\n", r$year, r$estimate, r$std.error))
}
cat("  Figure 5C (Other Ed Completions, annual):\n")
for (i in 1:nrow(fig5c_es$results)) {
  r <- fig5c_es$results[i, ]
  cat(sprintf("    %4d:  %7.3f (%5.3f)\n", r$year, r$estimate, r$std.error))
}

# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 6: STATE LICENSES EVENT STUDY
# ══════════════════════════════════════════════════════════════════════════════

cat("\n", strrep("=", 60), "\n")
cat("FIGURE 6: STATE LICENSES EVENT STUDY\n")
cat(strrep("=", 60), "\n")

# Following state_license_regressions.do:
# 1. Load graduation_event_data.xlsx
# 2. Collapse to state-year level (sum ctotalt by state-year)
# 3. Apply test_index fixes (fill, CT, SC)
# 4. Drop ND, TN
# 5. Merge with stateyrlicensetradalt.dta on statename-year
# 6. Create log_licenses
# 7. Keep years 2010-2019
# 8. Run event study with state + year FE

cat("Loading and preparing license data...\n")

# Load state treatment data (already has controls and year dummies)
state_treat <- read_stata("data/raw/policy/state_treatment.dta")

# Load license counts
license_counts <- read_stata("data/raw/licenses/stateyrlicensetradalt.dta")

# Prepare state treatment: fill forward test_index, apply CT/SC fixes
state_treat <- state_treat %>%
  filter(!State %in% EXCLUDE_STATES) %>%
  arrange(State, year) %>%
  group_by(State) %>%
  fill(test_index, .direction = "down") %>%
  ungroup() %>%
  mutate(test_index = if_else(State == "CT" & year > 2017, NA_real_, test_index)) %>%
  mutate(test_index = if_else(State == "SC" & year > 2017, SC_POST2017_VALUE, test_index))

# Merge with license counts on statename + year
license_data <- state_treat %>%
  inner_join(license_counts, by = c("statename", "year")) %>%
  mutate(
    log_licenses = log(licenses),
    state_f = factor(State),
    year_f = factor(year)
  ) %>%
  # Match Stata: drop if year < 2010; drop if year > 2019
  filter(year >= 2010, year <= 2019)

cat("  License data:", nrow(license_data), "obs,",
    n_distinct(license_data$State), "states\n")
cat("  Year range:", min(license_data$year), "-", max(license_data$year), "\n")

# Run event study: state + year FE, state-level controls only
# Stata uses year_2010 through year_2019 (drops 2009 since sparse)
# Reference year = 2012
license_year_vars <- paste0("year_", c(2010, 2011, 2013:2019))

fig6_es <- run_event_study(
  license_data, "log_licenses", license_year_vars,
  controls = c(state_demo, state_kraft),
  fe = "state_f + year_f",
  cluster_var = "State"
)

cat("  N =", fig6_es$n, ", clusters =", fig6_es$n_clusters, "\n")

write_csv(fig6_es$results, file.path(out_dir, "state_licenses_event_study_coefficients.csv"))
cat("  Saved: state_licenses_event_study_coefficients.csv\n")

cat("\n  License Event Study Coefficients:\n")
for (i in 1:nrow(fig6_es$results)) {
  r <- fig6_es$results[i, ]
  cat(sprintf("    %4d:  %7.3f (%5.3f)\n", r$year, r$estimate, r$std.error))
}

# ══════════════════════════════════════════════════════════════════════════════
# FIGURE 7: TEACHER SHORTAGES EVENT STUDY
# ══════════════════════════════════════════════════════════════════════════════

cat("\n", strrep("=", 60), "\n")
cat("FIGURE 7: TEACHER SHORTAGES EVENT STUDY\n")
cat(strrep("=", 60), "\n")

# Following shortage_regerssions.do:
# 1. Load state_treatment.dta
# 2. Apply test_index fixes
# 3. Merge 1:1 with total_shortages_state_year.dta on statename + year
# 4. Keep years 2009-2020
# 5. Run event study: ln_shortages ~ year_dummies + controls, absorb(State year)
# Figure 7 uses column 3 = "All States (log)" = ln_shortages

cat("Loading and preparing shortage data...\n")

shortage_raw <- read_stata("data/raw/shortages/total_shortages_state_year.dta")

# Merge state treatment with shortage data
shortage_data <- state_treat %>%
  inner_join(shortage_raw, by = c("statename", "year")) %>%
  filter(year >= 2009, year <= 2020) %>%
  mutate(
    state_f = factor(State),
    year_f = factor(year)
  )

cat("  Shortage data:", nrow(shortage_data), "obs,",
    n_distinct(shortage_data$State), "states\n")
cat("  Year range:", min(shortage_data$year), "-", max(shortage_data$year), "\n")

# Run event study for log shortages (Figure 7 = column 3 in Stata output)
shortage_year_vars <- paste0("year_", c(2009:2011, 2013:2020))

fig7_es <- run_event_study(
  shortage_data %>% filter(!is.na(ln_shortages)),
  "ln_shortages", shortage_year_vars,
  controls = c(state_demo, state_kraft),
  fe = "state_f + year_f",
  cluster_var = "State"
)

cat("  N =", fig7_es$n, ", clusters =", fig7_es$n_clusters, "\n")

write_csv(fig7_es$results, file.path(out_dir, "teacher_shortage_event_study.csv"))
cat("  Saved: teacher_shortage_event_study.csv\n")

cat("\n  Shortage Event Study Coefficients (log):\n")
for (i in 1:nrow(fig7_es$results)) {
  r <- fig7_es$results[i, ]
  cat(sprintf("    %4d:  %7.3f (%5.3f)\n", r$year, r$estimate, r$std.error))
}

# ══════════════════════════════════════════════════════════════════════════════
# Summary
# ══════════════════════════════════════════════════════════════════════════════

cat("\n", strrep("=", 60), "\n")
cat("SECONDARY REGRESSIONS SUMMARY\n")
cat(strrep("=", 60), "\n")

cat("\nOutput files:\n")
cat("  ", file.path(out_dir, "enrollments_placebo_event_study_total.csv"), "\n")
cat("  ", file.path(out_dir, "placebo_event_study_total.csv"), "\n")
cat("  ", file.path(out_dir, "placebo_non_ed_completions.csv"), "\n")
cat("  ", file.path(out_dir, "placebo_other_ed_completions.csv"), "\n")
cat("  ", file.path(out_dir, "state_licenses_event_study_coefficients.csv"), "\n")
cat("  ", file.path(out_dir, "teacher_shortage_event_study.csv"), "\n")

cat("\nSample sizes:\n")
cat(sprintf("  Fig 5A (Non-Ed Enrollments): N=%d, %d clusters\n",
            fig5a_es$n, fig5a_es$n_clusters))
cat(sprintf("  Fig 5B (Non-Ed Completions): N=%d, %d clusters\n",
            fig5b_es$n, fig5b_es$n_clusters))
cat(sprintf("  Fig 5C (Other Ed Completions): N=%d, %d clusters\n",
            fig5c_es$n, fig5c_es$n_clusters))
cat(sprintf("  Fig 6  (State Licenses): N=%d, %d clusters\n",
            fig6_es$n, fig6_es$n_clusters))
cat(sprintf("  Fig 7  (Teacher Shortages): N=%d, %d clusters\n",
            fig7_es$n, fig7_es$n_clusters))
cat("\n")
